In [1]:
import numpy as np
import pandas as pd
from IPython.display import display
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import FunctionTransformer
from scipy.stats import chi2_contingency
In [2]:
def read_data(path):
    df = pd.read_csv(path)
    print(df.columns)
    print()
    print("the shape is: ", df.shape)
    return df

Attendance¶

In [3]:
attendance = read_data('attendance.csv')
attendance.head()
Index(['home_team', 'away_team', 'Attendance', 'Date'], dtype='object')

the shape is:  (964, 4)
Out[3]:
home_team away_team Attendance Date
0 Argentina France 88966 12/18/2022
1 Croatia Morocco 44137 12/17/2022
2 France Morocco 68294 12/14/2022
3 Argentina Croatia 88966 12/13/2022
4 Morocco Portugal 44198 12/10/2022

Matches¶

In [4]:
matches = read_data('worldcup-1.1.0/data-csv/matches.csv')
matches[['stadium_id', 'home_team_name', 'away_team_name', 'match_date']].head()
Index(['key_id', 'tournament_id', 'tournament_name', 'match_id', 'match_name',
       'stage_name', 'group_name', 'group_stage', 'knockout_stage', 'replayed',
       'replay', 'match_date', 'match_time', 'stadium_id', 'stadium_name',
       'city_name', 'country_name', 'home_team_id', 'home_team_name',
       'home_team_code', 'away_team_id', 'away_team_name', 'away_team_code',
       'score', 'home_team_score', 'away_team_score', 'home_team_score_margin',
       'away_team_score_margin', 'extra_time', 'penalty_shootout',
       'score_penalties', 'home_team_score_penalties',
       'away_team_score_penalties', 'result', 'home_team_win', 'away_team_win',
       'draw'],
      dtype='object')

the shape is:  (964, 37)
Out[4]:
stadium_id home_team_name away_team_name match_date
0 S-193 France Mexico 7/13/1930
1 S-192 United States Belgium 7/13/1930
2 S-192 Yugoslavia Brazil 7/14/1930
3 S-193 Romania Peru 7/14/1930
4 S-192 Argentina France 7/15/1930

Stadiums¶

In [5]:
stadiums = read_data('worldcup-1.1.0/data-csv/stadiums.csv')
stadiums.head()
Index(['key_id', 'stadium_id', 'stadium_name', 'city_name', 'country_name',
       'stadium_capacity', 'stadium_wikipedia_link', 'city_wikipedia_link'],
      dtype='object')

the shape is:  (193, 8)
Out[5]:
key_id stadium_id stadium_name city_name country_name stadium_capacity stadium_wikipedia_link city_wikipedia_link
0 1 S-001 Estadio José Amalfitani Buenos Aires Argentina 49000 https://en.wikipedia.org/wiki/José_Amalfitani_... https://en.wikipedia.org/wiki/Buenos_Aires
1 2 S-002 Estadio Monumental Buenos Aires Argentina 75000 https://en.wikipedia.org/wiki/Estadio_Monument... https://en.wikipedia.org/wiki/Buenos_Aires
2 3 S-003 Estadio Chateau Carreras Córdoba Argentina 47000 https://en.wikipedia.org/wiki/Estadio_Mario_Al... https://en.wikipedia.org/wiki/Córdoba,_Argentina
3 4 S-004 Estadio José María Minella Mar del Plata Argentina 44000 https://en.wikipedia.org/wiki/Estadio_José_Mar... https://en.wikipedia.org/wiki/Mar_del_Plata
4 5 S-005 Estadio Ciudad de Mendoza Mendoza Argentina 35000 https://en.wikipedia.org/wiki/Estadio_Malvinas... https://en.wikipedia.org/wiki/Mendoza,_Argentina

Merging Matches with Stadiums¶

In [6]:
matches = pd.merge(
    matches,
    stadiums[['stadium_id', 'stadium_capacity']],
    on='stadium_id',
    how='inner'
)
matches = matches.sort_values('key_id').reset_index(drop=True)
matches.head()
Out[6]:
key_id tournament_id tournament_name match_id match_name stage_name group_name group_stage knockout_stage replayed ... extra_time penalty_shootout score_penalties home_team_score_penalties away_team_score_penalties result home_team_win away_team_win draw stadium_capacity
0 1 WC-1930 1930 FIFA World Cup M-1930-01 France v Mexico group stage Group 1 1 0 0 ... 0 0 0-0 0 0 home team win 1 0 0 10000
1 2 WC-1930 1930 FIFA World Cup M-1930-02 United States v Belgium group stage Group 4 1 0 0 ... 0 0 0-0 0 0 home team win 1 0 0 20000
2 3 WC-1930 1930 FIFA World Cup M-1930-03 Yugoslavia v Brazil group stage Group 2 1 0 0 ... 0 0 0-0 0 0 home team win 1 0 0 20000
3 4 WC-1930 1930 FIFA World Cup M-1930-04 Romania v Peru group stage Group 3 1 0 0 ... 0 0 0-0 0 0 home team win 1 0 0 10000
4 5 WC-1930 1930 FIFA World Cup M-1930-05 Argentina v France group stage Group 1 1 0 0 ... 0 0 0-0 0 0 home team win 1 0 0 20000

5 rows × 38 columns

In [7]:
matches.shape
Out[7]:
(964, 38)

عند الدمج مع الحضور تم ملاحظة أن عدد الأسطر بعد الدمج لا يساوي عدد الأسطر قبل الدمج

لذلك تم إنشاء تابع يقوم بإيجاد الفرق وفقا للأعمدة المعطاة في كل مرة

In [8]:
def check_differences(matches_lookups, attendance_lookups):
    df1 = matches[[lookup for lookup in matches_lookups]]
    s1 = {tuple(x) for x in df1.to_numpy()}

    df2 = attendance[[lookup for lookup in attendance_lookups]]
    s2 = {tuple(x) for x in df2.to_numpy()}

    return s1 - s2, s2 - s1

أولاً تفقد أسماء الفرق في الذهاب والإياب

In [9]:
check_differences(['home_team_name'], ['home_team'])
Out[9]:
({('China',),
  ('East Germany',),
  ('Iran',),
  ('Ivory Coast',),
  ('North Korea',),
  ('South Korea',),
  ('Turkey',)},
 {('China PR',),
  ("Côte d'Ivoire",),
  ('FR Yugoslavia',),
  ('Germany DR',),
  ('IR Iran',),
  ('Korea DPR',),
  ('Korea Republic',),
  ('Türkiye',)})
In [10]:
check_differences(['away_team_name'], ['away_team'])
Out[10]:
({('China',),
  ('East Germany',),
  ('Iran',),
  ('Ivory Coast',),
  ('North Korea',),
  ('South Korea',),
  ('Turkey',)},
 {('China PR',),
  ("Côte d'Ivoire",),
  ('FR Yugoslavia',),
  ('Germany DR',),
  ('IR Iran',),
  ('Korea DPR',),
  ('Korea Republic',),
  ('Türkiye',)})

هناك اختلاف بأسماء بعض الفرق، سنقوم بتوحيد الأسماء في كلا الجدولين

ملاحظة: في كلا الجدولين يوجد بعض المباريات التي فيها West Germany, Germany ولكن لأجل نفس المباراة قد يحوي الجدول الأول Germany بينما الجدول الثاني West Germany

In [11]:
print(
    attendance['home_team'][attendance['home_team'] == 'West Germany'].count()
    + attendance['home_team'][attendance['home_team'] == 'Germany'].count()
    + attendance['away_team'][attendance['away_team'] == 'West Germany'].count()
    + attendance['away_team'][attendance['away_team'] == 'Germany'].count()
)
print(
    matches['home_team_name'][matches['home_team_name']
                              == 'West Germany'].count()
    + matches['home_team_name'][matches['home_team_name'] == 'Germany'].count()
    + matches['away_team_name'][matches['away_team_name']
                                == 'West Germany'].count()
    + matches['away_team_name'][matches['away_team_name'] == 'Germany'].count()
)
112
112

هناك 7 دول لم تعد موجودة، وقامت الفيفا بتحديد من يأخذ سجلات الفريق الذي لم يعد موجودا

سيتم استعراض هذه الدول في القسم الثاني من القاموس التالي

https://www.dailystar.co.uk/sport/football/countries-not-exist-world-cup-28504115

In [12]:
country_map = {
    'China PR': 'China',
    'Côte d\'Ivoire': 'Ivory Coast',
    'IR Iran': 'Iran',
    'Türkiye': 'Turkey',
    'Germany DR': 'East Germany',
    'Korea DPR': 'North Korea',
    'Korea Republic': 'South Korea',

    # united/separated countries with countries that inherited their records according to FIFA
    'Soviet Union': 'Russia',
    'West Germany': 'Germany',  # Germany only inherited West Germany's record, not East Germany
    'FR Yugoslavia': 'Serbia',
    'Yugoslavia': 'Serbia',
    'Czechoslovakia': 'Czech Republic',
    'Dutch East Indies': 'Indonesia',
    'Zaire': 'DR Congo'
}

توحيد الأسماء في كلا الجدولين

In [13]:
def unify_teams_names(data, columns):
    for column in columns:
        data[column] = data[column].apply(lambda x: country_map.get(x, x))


def unify_matches_names(data, columns, delimiter=' v '):
    for column in columns:
        data[column] = data[column].apply(
            lambda x: delimiter.join(country_map.get(x, x) for x in x.split(delimiter))
        )
In [14]:
unify_teams_names(attendance, ['home_team', 'away_team'])
unify_teams_names(matches, ['home_team_name', 'away_team_name'])
unify_matches_names(matches, ['home_team_name', 'away_team_name'])

ثانياً تفقد تاريخ المباريات: لا يوجد فروقات

In [15]:
check_differences(['match_date'], ['Date'])
Out[15]:
(set(), set())

رغم كل ما سبق، لا زال يوجد اختلافات لذلك سننظر للثلاث أعمدة في نفس الوقت

In [16]:
diff1, diff2 = check_differences(
    ['home_team_name', 'away_team_name', 'match_date'],
    ['home_team', 'away_team', 'Date']
)
In [17]:
diff1 = list(diff1)
diff1.sort(key=lambda x: x[2])
diff1
Out[17]:
[('Argentina', 'Italy', '6/10/1978'),
 ('Russia', 'Uruguay', '6/14/1970'),
 ('Italy', 'Germany', '6/14/1978'),
 ('Austria', 'Netherlands', '6/14/1978'),
 ('Peru', 'Poland', '6/18/1978'),
 ('Netherlands', 'Germany', '6/18/1978'),
 ('England', 'Switzerland', '6/20/1954'),
 ('Italy', 'Netherlands', '6/21/1978'),
 ('Spain', 'Northern Ireland', '6/25/1982'),
 ('Austria', 'Switzerland', '6/26/1954'),
 ('Belgium', 'France', '6/28/1986'),
 ('Brazil', 'Sweden', '6/3/1978'),
 ('Republic of Ireland', 'Italy', '6/30/1990'),
 ('Argentina', 'Serbia', '6/30/1990'),
 ('Czech Republic', 'Germany', '7/1/1990'),
 ('Cameroon', 'England', '7/1/1990'),
 ('Chile', 'North Korea', '7/15/1966'),
 ('Argentina', 'Germany', '7/16/1966'),
 ('Mexico', 'Uruguay', '7/19/1966'),
 ('Argentina', 'Italy', '7/3/1990'),
 ('Russia', 'Poland', '7/4/1982')]
In [18]:
diff2 = list(diff2)
diff2.sort(key=lambda x: x[2])
diff2
Out[18]:
[('Italy', 'Argentina', '6/10/1978'),
 ('Uruguay', 'Russia', '6/14/1970'),
 ('Germany', 'Italy', '6/14/1978'),
 ('Netherlands', 'Austria', '6/14/1978'),
 ('Poland', 'Peru', '6/18/1978'),
 ('Germany', 'Netherlands', '6/18/1978'),
 ('Switzerland', 'England', '6/20/1954'),
 ('Netherlands', 'Italy', '6/21/1978'),
 ('Northern Ireland', 'Spain', '6/25/1982'),
 ('Switzerland', 'Austria', '6/26/1954'),
 ('France', 'Belgium', '6/28/1986'),
 ('Sweden', 'Brazil', '6/3/1978'),
 ('Italy', 'Republic of Ireland', '6/30/1990'),
 ('Serbia', 'Argentina', '6/30/1990'),
 ('England', 'Cameroon', '7/1/1990'),
 ('Germany', 'Czech Republic', '7/1/1990'),
 ('North Korea', 'Chile', '7/15/1966'),
 ('Germany', 'Argentina', '7/16/1966'),
 ('Uruguay', 'Mexico', '7/19/1966'),
 ('Italy', 'Argentina', '7/3/1990'),
 ('Poland', 'Russia', '7/4/1982')]

نلاحظ أن الفروقات ناتجة عن تبديل فرق الذهاب والإياب لبعض المباريات لذلك سنقوم بعكسهم

In [19]:
swapped_attendance = attendance[
    attendance[
        ['home_team', 'away_team', 'Date']
    ].apply(tuple, 1).isin(diff2)
]

swapped_attendance.head()
Out[19]:
home_team away_team Attendance Date
503 Italy Argentina 59978 7/3/1990
504 Germany Czech Republic 73347 7/1/1990
505 England Cameroon 55205 7/1/1990
506 Serbia Argentina 38971 6/30/1990
507 Italy Republic of Ireland 73303 6/30/1990
In [20]:
attendance.loc[
    swapped_attendance.index,
    ['home_team', 'away_team']
] = attendance.loc[
    swapped_attendance.index,
    ['away_team', 'home_team']
].values

attendance[500:505]
Out[20]:
home_team away_team Attendance Date
500 Germany Argentina 73603 7/8/1990
501 Italy England 51426 7/7/1990
502 Germany England 62628 7/4/1990
503 Argentina Italy 59978 7/3/1990
504 Czech Republic Germany 73347 7/1/1990
In [21]:
matches = pd.merge(
    matches, attendance,
    left_on=['home_team_name', 'away_team_name', 'match_date'],
    right_on=['home_team', 'away_team', 'Date'],
    how='inner'
)
matches.head()
Out[21]:
key_id tournament_id tournament_name match_id match_name stage_name group_name group_stage knockout_stage replayed ... away_team_score_penalties result home_team_win away_team_win draw stadium_capacity home_team away_team Attendance Date
0 1 WC-1930 1930 FIFA World Cup M-1930-01 France v Mexico group stage Group 1 1 0 0 ... 0 home team win 1 0 0 10000 France Mexico 4444 7/13/1930
1 2 WC-1930 1930 FIFA World Cup M-1930-02 United States v Belgium group stage Group 4 1 0 0 ... 0 home team win 1 0 0 20000 United States Belgium 18346 7/13/1930
2 3 WC-1930 1930 FIFA World Cup M-1930-03 Yugoslavia v Brazil group stage Group 2 1 0 0 ... 0 home team win 1 0 0 20000 Serbia Brazil 24059 7/14/1930
3 4 WC-1930 1930 FIFA World Cup M-1930-04 Romania v Peru group stage Group 3 1 0 0 ... 0 home team win 1 0 0 10000 Romania Peru 2549 7/14/1930
4 5 WC-1930 1930 FIFA World Cup M-1930-05 Argentina v France group stage Group 1 1 0 0 ... 0 home team win 1 0 0 20000 Argentina France 23409 7/15/1930

5 rows × 42 columns

تابع للتفقد من وجود قيم فارغة ضمن عمود، والأخر على مستوى جميع الأعمدة

In [22]:
def check_na(data, column):
    num_nan = data[column].isna().sum()
    if num_nan:
        raise Exception(f"There is {num_nan} nan values in {column}")


def check_all_na(data):
    for column in data.columns:
        check_na(data, column)
In [23]:
check_all_na(matches)
matches.shape
Out[23]:
(964, 42)
In [24]:
shape_before_drop = matches.shape
matches.drop_duplicates(inplace=True)
shape_before_drop == matches.shape
Out[24]:
True

Players¶

In [25]:
players = read_data('worldcup-1.1.0/data-csv/players.csv')
players.head()
Index(['key_id', 'player_id', 'family_name', 'given_name', 'birth_date',
       'goal_keeper', 'defender', 'midfielder', 'forward', 'count_tournaments',
       'list_tournaments', 'player_wikipedia_link'],
      dtype='object')

the shape is:  (8485, 12)
Out[25]:
key_id player_id family_name given_name birth_date goal_keeper defender midfielder forward count_tournaments list_tournaments player_wikipedia_link
0 1 P-08891 A'Court Alan 1934-09-30 0 0 0 1 1 1958 https://en.wikipedia.org/wiki/Alan_A%27Court
1 2 P-08589 Aaronson Brenden 2000-10-22 0 0 0 1 1 2022 https://en.wikipedia.org/wiki/Brenden_Aaronson
2 3 P-04897 Abadzhiev Stefan 1934-07-03 0 0 0 1 1 1966 https://en.wikipedia.org/wiki/Stefan_Abadzhiev
3 4 P-05556 Abalo Jean-Paul 1975-06-26 0 1 0 0 1 2006 https://en.wikipedia.org/wiki/Jean-Paul_Abalo
4 5 P-08163 Abanda Patrice 1978-08-03 0 1 0 0 1 1998 https://en.wikipedia.org/wiki/Patrice_Abanda

Squads¶

In [26]:
squads = read_data('worldcup-1.1.0/data-csv/squads.csv')
squads[squads['team_name'] == 'Russia'].head()
Index(['key_id', 'tournament_id', 'tournament_name', 'team_id', 'team_name',
       'team_code', 'player_id', 'family_name', 'given_name', 'shirt_number',
       'position_name', 'position_code'],
      dtype='object')

the shape is:  (10973, 12)
Out[26]:
key_id tournament_id tournament_name team_id team_name team_code player_id family_name given_name shirt_number position_name position_code
5603 5604 WC-1994 1994 FIFA World Cup T-60 Russia RUS P-09869 Cherchesov Stanislav 1 goal keeper GK
5604 5605 WC-1994 1994 FIFA World Cup T-60 Russia RUS P-03790 Kuznetsov Dmitri 2 defender DF
5605 5606 WC-1994 1994 FIFA World Cup T-60 Russia RUS P-06666 Gorlukovich Sergei 3 defender DF
5606 5607 WC-1994 1994 FIFA World Cup T-60 Russia RUS P-04883 Galiamin Dmitri 4 defender DF
5607 5608 WC-1994 1994 FIFA World Cup T-60 Russia RUS P-02169 Nikiforov Yuri 5 defender DF
سيتم معالجة أكواد الفرق والـids لتلائم توحيد أسماء الفرق
In [27]:
teams_codes = dict(zip(squads['team_name'], squads['team_code']))
teams_codes['Indonesia'] = teams_codes['Dutch East Indies']
teams_codes['DR Congo'] = teams_codes['Zaire']
list(teams_codes.items())[:5]
Out[27]:
[('Argentina', 'ARG'),
 ('Belgium', 'BEL'),
 ('Bolivia', 'BOL'),
 ('Brazil', 'BRA'),
 ('Chile', 'CHL')]
In [28]:
teams_ids = dict(zip(squads['team_name'], squads['team_id']))
teams_ids['Indonesia'] = teams_ids['Dutch East Indies']
teams_ids['DR Congo'] = teams_ids['Zaire']
In [29]:
def unify_according_to_team_name(my_dict):

    def unify(data, columns):
        for column in columns:
            team_name, mapped_attribute = column
            data[mapped_attribute] = data[[team_name, mapped_attribute]].apply(
                lambda x:  my_dict[
                    country_map.get(x[team_name], x[team_name])
                ], axis=1
            )

    return unify


unify_teams_codes = unify_according_to_team_name(teams_codes)
unify_teams_ids = unify_according_to_team_name(teams_ids)
In [30]:
unify_teams_names(squads, ['team_name'])
unify_teams_codes(squads, [('team_name', 'team_code')])
unify_teams_ids(squads, [('team_name', 'team_id')])
squads[squads['team_name'] == 'Russia'].head()
Out[30]:
key_id tournament_id tournament_name team_id team_name team_code player_id family_name given_name shirt_number position_name position_code
1780 1781 WC-1958 1958 FIFA World Cup T-60 Russia RUS P-07374 Yashin Lev 1 goal keeper GK
1781 1782 WC-1958 1958 FIFA World Cup T-60 Russia RUS P-00011 Kesarev Vladimir 2 defender DF
1782 1783 WC-1958 1958 FIFA World Cup T-60 Russia RUS P-05169 Krizhevsky Konstantin 3 midfielder MF
1783 1784 WC-1958 1958 FIFA World Cup T-60 Russia RUS P-08404 Kuznetsov Boris 4 defender DF
1784 1785 WC-1958 1958 FIFA World Cup T-60 Russia RUS P-05220 Voinov Yuri 5 defender DF
In [31]:
unify_teams_codes(matches, [
    ('home_team_name', 'home_team_code'),
    ('away_team_name', 'away_team_code')
])
unify_teams_ids(matches, [
    ('home_team_name', 'home_team_id'),
    ('away_team_name', 'away_team_id')
])

Players Teams¶

In [32]:
players_teams = pd.merge(
    players[[
        'player_id', 'family_name', 'given_name',
        'count_tournaments', 'list_tournaments'
    ]],
    squads[['player_id', 'tournament_name', 'team_name', 'team_code']],
    on='player_id',
    how='inner'
)

players_teams.head()
Out[32]:
player_id family_name given_name count_tournaments list_tournaments tournament_name team_name team_code
0 P-08891 A'Court Alan 1 1958 1958 FIFA World Cup England ENG
1 P-08589 Aaronson Brenden 1 2022 2022 FIFA World Cup United States USA
2 P-04897 Abadzhiev Stefan 1 1966 1966 FIFA World Cup Bulgaria BGR
3 P-05556 Abalo Jean-Paul 1 2006 2006 FIFA World Cup Togo TGO
4 P-08163 Abanda Patrice 1 1998 1998 FIFA World Cup Cameroon CMR
In [33]:
check_all_na(players_teams)
players_teams.shape
Out[33]:
(10973, 8)
ملاحظة: تم تحقيق دمج أسماء الفرق والكود الخاص بها باستخدام الفاصلة للحفاظ على التنسيق المتبع، مع أنه من الأنسب استخدام المصفوفات تم الإبقاء على التكرارات في أسماء الفرق والأكواد لكي يتم البيان المنتخب الذي لعب معه اللاعب في كل بطولة
In [34]:
players_teams = players_teams.groupby('player_id', as_index=False).agg(
    {'player_id': 'first',
     'family_name': 'first',
     'given_name': 'first',
     'count_tournaments': 'first',
     'list_tournaments': 'first',
     'tournament_name': ', '.join,
     'team_name': ', '.join,
     'team_code': ', '.join
     })
In [35]:
players_teams['teams_num'] = players_teams['team_code'].apply(
    lambda x: len(set(x.split(', '))))

check_na(players_teams, 'teams_num')
players_teams.head()
Out[35]:
player_id family_name given_name count_tournaments list_tournaments tournament_name team_name team_code teams_num
0 P-00001 Schumacher Harald 2 1982, 1986 1982 FIFA World Cup, 1986 FIFA World Cup Germany, Germany DEU, DEU 1
1 P-00002 Mohammed Sharif Ismail 1 1986 1986 FIFA World Cup Iraq IRQ 1
2 P-00003 Otávio not applicable 1 2022 2022 FIFA World Cup Portugal PRT 1
3 P-00004 Cortés Julio César 3 1962, 1966, 1970 1962 FIFA World Cup, 1966 FIFA World Cup, 1970... Uruguay, Uruguay, Uruguay URY, URY, URY 1
4 P-00005 Vandersmissen Guy 1 1982 1982 FIFA World Cup Belgium BEL 1
In [36]:
players_teams.loc[players_teams['teams_num'] > 1].head()
Out[36]:
player_id family_name given_name count_tournaments list_tournaments tournament_name team_name team_code teams_num
455 P-00537 Wagner Franz 2 1934, 1938 1934 FIFA World Cup, 1938 FIFA World Cup Austria, Germany AUT, DEU 2
1080 P-01259 Puskás Ferenc 2 1954, 1962 1954 FIFA World Cup, 1962 FIFA World Cup Hungary, Spain HUN, ESP 2
1298 P-01512 Altafini José 2 1958, 1962 1958 FIFA World Cup, 1962 FIFA World Cup Brazil, Italy BRA, ITA 2
1490 P-01739 Šuker Davor 3 1990, 1998, 2002 1990 FIFA World Cup, 1998 FIFA World Cup, 2002... Serbia, Croatia, Croatia SRB, HRV, HRV 2
1502 P-01757 Raftl Rudolf 2 1934, 1938 1934 FIFA World Cup, 1938 FIFA World Cup Austria, Germany AUT, DEU 2

Tournaments¶

In [37]:
tournaments = read_data('worldcup-1.1.0/data-csv/tournaments.csv')
tournaments.head()
Index(['key_id', 'tournament_id', 'tournament_name', 'year', 'start_date',
       'end_date', 'host_country', 'winner', 'host_won', 'count_teams',
       'group_stage', 'second_group_stage', 'final_round', 'round_of_16',
       'quarter_finals', 'semi_finals', 'third_place_match', 'final'],
      dtype='object')

the shape is:  (22, 18)
Out[37]:
key_id tournament_id tournament_name year start_date end_date host_country winner host_won count_teams group_stage second_group_stage final_round round_of_16 quarter_finals semi_finals third_place_match final
0 1 WC-1930 1930 FIFA World Cup 1930 1930-07-13 1930-07-30 Uruguay Uruguay 1 13 1 0 0 0 0 1 0 1
1 2 WC-1934 1934 FIFA World Cup 1934 1934-05-27 1934-06-10 Italy Italy 1 16 0 0 0 1 1 1 1 1
2 3 WC-1938 1938 FIFA World Cup 1938 1938-06-04 1938-06-19 France Italy 0 15 0 0 0 1 1 1 1 1
3 4 WC-1950 1950 FIFA World Cup 1950 1950-06-24 1950-07-16 Brazil Uruguay 0 13 1 0 1 0 0 0 0 0
4 5 WC-1954 1954 FIFA World Cup 1954 1954-06-16 1954-07-04 Switzerland West Germany 0 16 1 0 0 0 1 1 1 1
In [38]:
unify_teams_names(tournaments, ['host_country', 'winner'])
check_all_na(tournaments)
In [39]:
matches = pd.merge(
    matches,
    tournaments[['tournament_id', 'host_country']],
    on='tournament_id',
)
In [40]:
check_all_na(matches)
matches.shape
Out[40]:
(964, 43)

Goals¶

In [41]:
goals = read_data('worldcup-1.1.0/data-csv/goals.csv')
goals[goals['player_team_name'] == 'Soviet Union'].head()
Index(['key_id', 'goal_id', 'tournament_id', 'tournament_name', 'match_id',
       'match_name', 'match_date', 'stage_name', 'group_name', 'team_id',
       'team_name', 'team_code', 'home_team', 'away_team', 'player_id',
       'family_name', 'given_name', 'shirt_number', 'player_team_id',
       'player_team_name', 'player_team_code', 'minute_label',
       'minute_regulation', 'minute_stoppage', 'match_period', 'own_goal',
       'penalty'],
      dtype='object')

the shape is:  (2720, 27)
Out[41]:
key_id goal_id tournament_id tournament_name match_id match_name match_date stage_name group_name team_id ... shirt_number player_team_id player_team_name player_team_code minute_label minute_regulation minute_stoppage match_period own_goal penalty
477 478 G-0478 WC-1958 1958 FIFA World Cup M-1958-08 Soviet Union v England 1958-06-08 group stage Group 4 T-70 ... 0 T-70 Soviet Union SUN 13' 13 0 first half 0 0
478 479 G-0479 WC-1958 1958 FIFA World Cup M-1958-08 Soviet Union v England 1958-06-08 group stage Group 4 T-70 ... 0 T-70 Soviet Union SUN 56' 56 0 second half 0 0
501 502 G-0502 WC-1958 1958 FIFA World Cup M-1958-15 Soviet Union v Austria 1958-06-11 group stage Group 4 T-70 ... 0 T-70 Soviet Union SUN 15' 15 0 first half 0 0
502 503 G-0503 WC-1958 1958 FIFA World Cup M-1958-15 Soviet Union v Austria 1958-06-11 group stage Group 4 T-70 ... 0 T-70 Soviet Union SUN 62' 62 0 second half 0 0
542 543 G-0543 WC-1958 1958 FIFA World Cup M-1958-27 Soviet Union v England 1958-06-17 group stage Group 4 T-70 ... 0 T-70 Soviet Union SUN 69' 69 0 second half 0 0

5 rows × 27 columns

In [42]:
unify_teams_names(goals, ['player_team_name', 'team_name'])
unify_teams_codes(goals, [
    ('player_team_name', 'player_team_code'),
    ('team_name', 'team_code')
])
unify_teams_ids(goals, [
    ('player_team_name', 'player_team_id'),
    ('team_name', 'team_id')
])
unify_matches_names(goals, ['match_name'])
check_all_na(goals)
goals[goals['player_team_name'] == 'Russia'].head()
Out[42]:
key_id goal_id tournament_id tournament_name match_id match_name match_date stage_name group_name team_id ... shirt_number player_team_id player_team_name player_team_code minute_label minute_regulation minute_stoppage match_period own_goal penalty
477 478 G-0478 WC-1958 1958 FIFA World Cup M-1958-08 Russia v England 1958-06-08 group stage Group 4 T-60 ... 0 T-60 Russia RUS 13' 13 0 first half 0 0
478 479 G-0479 WC-1958 1958 FIFA World Cup M-1958-08 Russia v England 1958-06-08 group stage Group 4 T-60 ... 0 T-60 Russia RUS 56' 56 0 second half 0 0
501 502 G-0502 WC-1958 1958 FIFA World Cup M-1958-15 Russia v Austria 1958-06-11 group stage Group 4 T-60 ... 0 T-60 Russia RUS 15' 15 0 first half 0 0
502 503 G-0503 WC-1958 1958 FIFA World Cup M-1958-15 Russia v Austria 1958-06-11 group stage Group 4 T-60 ... 0 T-60 Russia RUS 62' 62 0 second half 0 0
542 543 G-0543 WC-1958 1958 FIFA World Cup M-1958-27 Russia v England 1958-06-17 group stage Group 4 T-60 ... 0 T-60 Russia RUS 69' 69 0 second half 0 0

5 rows × 27 columns

Features Engineering¶

1.total goals in a match¶

ملاحظة: من الخاطئ أن يتم أخذ الأهداف المسجلة في ركلات الترجيح بعين الاعتبار وذلك حسب قواعد كرة القدم
In [43]:
matches['total_goals_in_match'] =\
    matches['home_team_score'] + \
    matches['away_team_score']

matches[['total_goals_in_match', 'home_team_score', 'away_team_score']].head()
Out[43]:
total_goals_in_match home_team_score away_team_score
0 5 4 1
1 3 3 0
2 3 2 1
3 4 3 1
4 1 1 0

2.host for a match¶

In [44]:
matches['host_for_match'] = (matches['host_country'] == matches['home_team_name']) \
    | (matches['host_country'] == matches['away_team_name'])

check_na(matches, 'host_for_match')

matches[
    ['host_for_match', 'host_country', 'home_team_name', 'away_team_name']
][matches['host_country'] == 'Qatar'].head()
Out[44]:
host_for_match host_country home_team_name away_team_name
900 True Qatar Qatar Ecuador
901 False Qatar England Iran
902 False Qatar Senegal Netherlands
903 False Qatar United States Wales
904 False Qatar Argentina Saudi Arabia

3.used capacity ratio¶

In [45]:
matches['used_capacity_ratio'] = matches['Attendance'] / matches['stadium_capacity']
matches[['home_team_name', 'away_team_name', 'Date', 'stadium_id',
         'used_capacity_ratio', 'Attendance', 'stadium_capacity']].head()
Out[45]:
home_team_name away_team_name Date stadium_id used_capacity_ratio Attendance stadium_capacity
0 France Mexico 7/13/1930 S-193 0.44440 4444 10000
1 United States Belgium 7/13/1930 S-192 0.91730 18346 20000
2 Serbia Brazil 7/14/1930 S-192 1.20295 24059 20000
3 Romania Peru 7/14/1930 S-193 0.25490 2549 10000
4 Argentina France 7/15/1930 S-192 1.17045 23409 20000

4¶

attendance discretization¶

سيتم تقسيم الأقسام بحيث يكون عدد الأسطر في كل قسم متساوي

لم يتم استخدام المتوسط الحسابي تجنباً للتأثير القيم المتطرفة

In [46]:
matches['Attendance'].quantile([.33, .66])
Out[46]:
0.33    35848.33
0.66    51334.08
Name: Attendance, dtype: float64
In [47]:
labels = ['low', 'medium', 'high']
transformer = FunctionTransformer(pd.qcut, kw_args={'q': 3, 'labels': labels})
matches['attendance_category'] = transformer.fit_transform(matches['Attendance'])
matches[['attendance_category', 'Attendance']].tail()
Out[47]:
attendance_category Attendance
959 high 68895
960 high 88966
961 high 68294
962 medium 44137
963 high 88966
In [48]:
matches['attendance_category'].value_counts()
Out[48]:
low       327
medium    322
high      315
Name: attendance_category, dtype: int64

used capacity ratio discretization¶

In [49]:
matches['used_capacity_ratio'].quantile([.33, .66])
Out[49]:
0.33    0.813620
0.66    0.975304
Name: used_capacity_ratio, dtype: float64
In [50]:
labels = ['low', 'medium', 'high']
transformer = FunctionTransformer(pd.qcut, kw_args={'q': 3, 'labels': labels})
matches['relative_attendance_category'] = transformer.fit_transform(
    matches['used_capacity_ratio']
)

matches[['relative_attendance_category', 'used_capacity_ratio']].tail()
Out[50]:
relative_attendance_category used_capacity_ratio
959 high 0.998478
960 high 0.999618
961 high 0.989768
962 medium 0.959500
963 high 0.999618
In [51]:
matches['relative_attendance_category'].value_counts()
Out[51]:
low       322
medium    321
high      321
Name: relative_attendance_category, dtype: int64

5.host country code¶

في البداية تم ملاحظة مشكلة في كأس العالم 2002، حيث تستضيفه دولتان وتم ذكر اسم كوريا وفي الواقع فإن كوريا الجنوبية هي من كانت المستضيفة إلى جانب اليابان

In [52]:
matches_hosts = matches['host_country'].str.replace('Korea, Japan', 'South Korea, Japan')
matches_hosts[matches_hosts == 'South Korea, Japan'].head()
Out[52]:
580    South Korea, Japan
581    South Korea, Japan
582    South Korea, Japan
583    South Korea, Japan
584    South Korea, Japan
Name: host_country, dtype: object
In [53]:
matches['host_country_code'] = matches_hosts.apply(
    lambda x: ', '.join([teams_codes[country] for country in x.split(', ')])
)

check_na(matches, 'host_country_code')

matches[
    ['host_country', 'host_country_code', 'home_team_name', 'away_team_name']
][matches['host_country'] == 'Korea, Japan'].head()
Out[53]:
host_country host_country_code home_team_name away_team_name
580 Korea, Japan KOR, JPN France Senegal
581 Korea, Japan KOR, JPN Republic of Ireland Cameroon
582 Korea, Japan KOR, JPN Uruguay Denmark
583 Korea, Japan KOR, JPN Germany Saudi Arabia
584 Korea, Japan KOR, JPN Argentina Nigeria

6.tournament year¶

In [54]:
matches['tournament_year'] = matches['tournament_name'].apply(lambda x: int(x[:4]))
matches[['tournament_year', 'tournament_name']].head()
Out[54]:
tournament_year tournament_name
0 1930 1930 FIFA World Cup
1 1930 1930 FIFA World Cup
2 1930 1930 FIFA World Cup
3 1930 1930 FIFA World Cup
4 1930 1930 FIFA World Cup
In [55]:
goals['tournament_year'] = goals['tournament_name'].apply(lambda x: int(x[:4]))
goals[['tournament_year', 'tournament_name']].head()
Out[55]:
tournament_year tournament_name
0 1930 1930 FIFA World Cup
1 1930 1930 FIFA World Cup
2 1930 1930 FIFA World Cup
3 1930 1930 FIFA World Cup
4 1930 1930 FIFA World Cup

7.full name¶

In [56]:
given_names = players_teams[['given_name']].value_counts()
display(given_names.head())
given_names.tail()
given_name    
not applicable    385
Carlos             75
José               61
Luis               58
Mario              49
dtype: int64
Out[56]:
given_name
Leongino      1
Demy          1
Leonard       1
Denny         1
N'Golo        1
dtype: int64
In [57]:
family_names = players_teams[['family_name']].value_counts()
display(family_names.head())
family_names.tail()
family_name
Kim            38
Rodríguez      29
Lee            28
González       25
Silva          18
dtype: int64
Out[57]:
family_name
Graafland      1
Goyzueta       1
Goycochea      1
Govou          1
Kundé          1
dtype: int64
In [58]:
def generate_full_name(data, given_name_col='given_name', family_name_col='family_name', full_name_col='full_name'):
    mask = data[given_name_col] != 'not applicable'
    first_name = data[given_name_col].where(mask, '')
    data[full_name_col] = first_name + ' ' + data[family_name_col]
    check_na(data, full_name_col)
In [59]:
generate_full_name(players_teams)
players_teams[['given_name', 'family_name', 'full_name']].tail()
Out[59]:
given_name family_name full_name
8480 Landry N'Guémo Landry N'Guémo
8481 Ramiro Castillo Ramiro Castillo
8482 Jonás Gutiérrez Jonás Gutiérrez
8483 not applicable Rivaldo Rivaldo
8484 Giorgian De Arrascaeta Giorgian De Arrascaeta
In [60]:
generate_full_name(goals)
goals[['given_name', 'family_name', 'full_name']].tail()
Out[60]:
given_name family_name full_name
2715 Ángel Di María Ángel Di María
2716 Kylian Mbappé Kylian Mbappé
2717 Kylian Mbappé Kylian Mbappé
2718 Lionel Messi Lionel Messi
2719 Kylian Mbappé Kylian Mbappé

8.short stage name¶

In [61]:
matches['stage_name'].value_counts()
Out[61]:
group stage           676
round of 16            97
quarter-finals         70
semi-finals            38
second group stage     36
final                  21
third-place match      20
final round             6
Name: stage_name, dtype: int64

يوجد دوري مجموعات والباقي تصنف ضمن دور خروج المغلوب

In [62]:
matches['short_stage_name'] = matches['stage_name'].apply(
    lambda x: 'G' if 'group' in x else 'K'
)
matches[['short_stage_name', 'stage_name']][50:55]
Out[62]:
short_stage_name stage_name
50 K semi-finals
51 K third-place match
52 K final
53 G group stage
54 G group stage

9.winner code¶

In [63]:
tournaments['winner_code'] = tournaments['winner'].map(teams_codes)
check_na(tournaments, 'winner_code')
tournaments[['tournament_name', 'winner', 'winner_code']].tail()
Out[63]:
tournament_name winner winner_code
17 2006 FIFA World Cup Italy ITA
18 2010 FIFA World Cup Spain ESP
19 2014 FIFA World Cup Germany DEU
20 2018 FIFA World Cup France FRA
21 2022 FIFA World Cup Argentina ARG

10.late goal¶

In [64]:
set(goals['match_period'].values)
Out[64]:
{'extra time, first half',
 'extra time, first half, stoppage time',
 'extra time, second half',
 'extra time, second half, stoppage time',
 'first half',
 'first half, stoppage time',
 'second half',
 'second half, stoppage time'}

سيتم اعتبار الهدف المتأخر على أنه الهدف الذي يأتي بأخر 5 دقائق من كل شوط

In [65]:
periods_endings = {
    'first half': 45,
    'first half, stoppage time': 45,
    'second half': 90,
    'second half, stoppage time': 90,
    'extra time, first half': 105,
    'extra time, first half, stoppage time': 105,
    'extra time, second half': 120,
    'extra time, second half, stoppage time': 120
}


goals['late_goal'] = goals[['match_period', 'minute_regulation']].apply(
    lambda x:  x['minute_regulation'] >= periods_endings[x['match_period']] - 5,
    axis=1
)

goals[goals['minute_regulation'] > 89][['late_goal', 'match_period', 'minute_regulation']][:10]
Out[65]:
late_goal match_period minute_regulation
72 False extra time, first half 93
73 False extra time, second half 109
74 True extra time, second half 116
98 True second half 90
139 False extra time, first half 95
146 True extra time, first half 103
147 True extra time, first half 105
160 False extra time, first half 94
169 False extra time, first half 93
170 True extra time, first half 104

Exploration and Analysis¶

A¶

Attendance¶

In [66]:
attendance_in_year = matches.groupby(['tournament_year'])['Attendance']
tournament_years = matches['tournament_year'].unique()

fig = px.line(matches, x=tournament_years, y=attendance_in_year.mean())

fig.add_scatter(x=tournament_years, y=attendance_in_year.mean(), name='Mean')
fig.add_scatter(x=tournament_years, y=attendance_in_year.median(), name='Median')

fig.update_layout(width=716, height=350)
fig.show()
In [67]:
matches.loc[matches['tournament_year'] == 1994]['host_country'].unique()
Out[67]:
array(['United States'], dtype=object)
نلاحظ أنّ بطولة 1994 سجلت أعلى عدد حضور وأعلى متوسط لهم، في عام 1950 كان هناك فرق كبير بين الوسيط والمتوسط، وهذا يدل على عدم توزع الحضور بشكل منتظم على المباريات ووجود بعض القيم المتطرفة ذات الحضور القليل أو الكبير جدا كذلك الأمر بالنسبة لـ 1986 ولك بدرجة أقل
In [68]:
fig = go.Figure(
    data=go.Histogram(
        x=matches['Attendance'], histfunc='count', xbins={'size': 10000}
    )
)

fig.update_layout(
    xaxis_title='attendance', yaxis_title='count',
    width=600, height=400
)

fig.show()
In [69]:
matches.loc[matches['Attendance'] > 130000][
    ['match_name', 'tournament_year', 'Attendance', 'stage_name', 'host_country',
        'used_capacity_ratio', 'stadium_name', 'stadium_capacity']
].sort_values('Attendance', ascending=False)
Out[69]:
match_name tournament_year Attendance stage_name host_country used_capacity_ratio stadium_name stadium_capacity
74 Uruguay v Brazil 1950 173850 final round Brazil 0.869250 Estádio do Maracanã 200000
71 Brazil v Spain 1950 152772 final round Brazil 0.763860 Estádio do Maracanã 200000
63 Brazil v Yugoslavia 1950 142429 group stage Brazil 0.712145 Estádio do Maracanã 200000
69 Brazil v Sweden 1950 138886 final round Brazil 0.694430 Estádio do Maracanã 200000
In [70]:
stadiums[
    ['stadium_name', 'stadium_capacity']
].sort_values('stadium_capacity', ascending=False).head()
Out[70]:
stadium_name stadium_capacity
19 Estádio do Maracanã 200000
148 Camp Nou 121000
98 Estadio Azteca 115000
29 Wembley Stadium 99000
186 Rose Bowl 94000
نلاحظ أنّ عدد الحضور الغالب ما بين 30 ألف و50 ألف ،ولكن سجل عدد الحضور قيم متطرفة ما بين 130 ألف و 175 ألف وذلك عام 1950 سبب ذلك هو كونه كان أكبر ملعب في العالم قبل أن يتم تقليص حجمه
In [71]:
fig = px.box(matches, y="Attendance", x="tournament_year", width=600, height=400)
fig.show()
نلاحظ أنّ أكثر مباراة حققت نسبة حضورعالميا كانت ب1950 بعدد حوالي 173 ألف وكما ذكرنا في عام 1950 و 1986 فهناك قيم متطرفة،أيضاً في 2018 كان هناك قيم متطرفة كبيرة

في عام 1970 كان التوزع منتظماً، والوسيط أقرب ما يمكن للمتوسط في 1950 و 1960 و 2010 و 2022، نلاحظ الفرق الكبير بين الوسيط و q3

نستنتج من كل ما سبق أنه في الأغلب البطولات توحد قيم متطرفة والتي غالباً ما تمثل نهائيات البطولة

B¶

1. goals minutes median per tournament¶

سيتم إجراء الحسابات وفق الدقائق مع الوقت بدل الضائع

In [72]:
goals['minute_regulation_with_stoppage'] = goals['minute_regulation']\
    + goals['minute_stoppage']
In [73]:
goals_by_tournament = goals.groupby('tournament_year')
minutes_meds = goals_by_tournament['minute_regulation_with_stoppage'].median()

fig = px.bar(
    x=tournament_years, y=minutes_meds,
    labels={'x': 'tournament years', 'y': 'goals minutes median'},
    width=600, height=330
)
fig.show()
In [74]:
print(minutes_meds.count())
(minutes_meds <= 60).sum()
22
Out[74]:
21

هناك نسختي كأس عالم 1942 و 1946 تم إلغاؤهما بسبب الحرب العالمية

متوسط دقائق الأهداف قريب من متوسط دقائق المباراة أي 45 دقيقة وفقط قيمة واحدة تتجاوز الـ 60 دقيقة

2. goals per match histogram¶

In [75]:
fig = px.histogram(
    matches, x="total_goals_in_match",
    color='total_goals_in_match',
    histfunc='count',
    width=600, height=400
)
fig.show()

معظم المباريات تنتهي بهدف أو اثنين أو ثلاثة

3. goals and periods modes¶

ملاحظة: عند أخذ الدقائق دون احتساب الوقت بدل الضائع فمن المنطقي أن 45 و 90 سيكونا من القيم الأكثر تكراراً

In [76]:
goals['minute_regulation'].value_counts().head()
Out[76]:
90    111
45     50
75     47
73     45
18     44
Name: minute_regulation, dtype: int64
In [77]:
goals_per_tournament = goals[
    ['tournament_year', 'minute_regulation_with_stoppage', 'match_period']
].groupby('tournament_year', as_index=False).agg(pd.Series.mode)

goals_per_tournament
Out[77]:
tournament_year minute_regulation_with_stoppage match_period
0 1930 65 second half
1 1934 [18, 29] [first half, second half]
2 1938 [35, 44, 89] first half
3 1950 17 second half
4 1954 [18, 30, 54, 60, 69, 78, 84, 85] second half
5 1958 [4, 18, 24, 32, 44, 52, 55] second half
6 1962 [17, 29, 56, 73, 90] second half
7 1966 [15, 43, 75, 88] second half
8 1970 76 second half
9 1974 18 second half
10 1978 [43, 45] first half
11 1982 [68, 75, 83] second half
12 1986 62 second half
13 1990 81 second half
14 1994 90 second half
15 1998 46 second half
16 2002 88 second half
17 2006 [6, 84] first half
18 2010 89 second half
19 2014 82 second half
20 2018 51 second half
21 2022 48 second half

النتيجة منطقية، بسبب أن الأشواط الإضافية أقل وقتا ولا يتم دائماً اللجوء لها بالإضافة ووفقا لمخطط وسيط دقيقة الهدف في البطولة فإن جميع القيم أعلى من 45

4. late goals per tournament histogram¶

In [78]:
late_goals_per_tour = goals_by_tournament['late_goal'].sum()

fig = go.Figure(
    data=go.Histogram(
        x=late_goals_per_tour, histfunc='count', xbins={'size': 1}
    )
)

fig.update_layout(
    xaxis_title='late goals per tournament', yaxis_title='count',
    width=600, height=400
)
fig.show()

لا يوجد تكرارات كثيرة لعدد الأهداف المتأخرة في البطولات بالإضافة إلى امتداد القيم على مجال واسع نسبياً من 8 إلى 42

5. top 12 scorers at all¶

In [79]:
goals_scorers = goals[[
    'player_id', 'full_name', 'goal_id'
]].groupby('player_id', as_index=False).agg({
    'full_name': 'first',
    'goal_id': 'count'
}).rename(columns={
    'goal_id': 'goals_counter'
}).sort_values(by='goals_counter', ascending=False)

goals_scorers.head()
Out[79]:
player_id full_name goals_counter
775 P-05224 Miroslav Klose 16
1246 P-08490 Ronaldo 15
325 P-02173 Gerd Müller 14
377 P-02537 Just Fontaine 13
517 P-03429 Lionel Messi 13
In [80]:
fig = px.bar(
    goals_scorers[:12], x='full_name', y='goals_counter',
    width=600, height=400
)
fig.show()

نستنتج شدة التنافس بين اللاعبين على لقب الهداف التاريخي لكأس العالم (وعدم وجود قيم متطرفة)، وأن كليان مبابي سيكون لديه فرصة كبيرة لكسر الرقم القياسي لكلوزة كونه لا يزال بعمر 24 ولديه فرصة للعب كأسين عالم على الأقل

6. top scorers per tournament¶

ملاحظة: تم الأخذ بعين الاعتبار أن يكون للبطولة أكثر من هداف كما حدث في 2010

In [81]:
scorers_per_tournament = goals[[
    'player_id', 'full_name', 'tournament_year', 'goal_id'
]].groupby(['player_id', 'tournament_year'], as_index=False).agg({
    'full_name': 'first',
    'goal_id': 'count'
}).rename(columns={
    'goal_id': 'goals_counter'
}).sort_values(by='goals_counter', ascending=False)

scorers_per_tournament[scorers_per_tournament['tournament_year'] == 2022].head()
Out[81]:
player_id tournament_year full_name goals_counter
1237 P-06978 2022 Kylian Mbappé 8
605 P-03429 2022 Lionel Messi 7
995 P-05731 2022 Julián Álvarez 4
1293 P-07287 2022 Olivier Giroud 4
244 P-01503 2022 Enner Valencia 3
In [82]:
# finding who achieve goals equal to the max value in each tournament
top_scorers_per_tournament_idx = scorers_per_tournament.groupby(
    'tournament_year'
)['goals_counter'].transform(max) == scorers_per_tournament['goals_counter']

# grouping by tournament_year
top_scorers_per_tournament = scorers_per_tournament[
    top_scorers_per_tournament_idx
].groupby('tournament_year', as_index=False).agg(
    {
        'full_name': ', '.join,
        'goals_counter': 'first'
    }
).sort_values(by='tournament_year', ascending=False)

top_scorers_per_tournament.head()
Out[82]:
tournament_year full_name goals_counter
21 2022 Kylian Mbappé 8
20 2018 Harry Kane 6
19 2014 James Rodríguez 6
18 2010 Thomas Müller, Wesley Sneijder, David Villa, D... 5
17 2006 Miroslav Klose 5
In [83]:
fig = px.bar(
    top_scorers_per_tournament,
    x='tournament_year', y='goals_counter', hover_data='full_name',
    width=600, height=400
)

fig.show()

نلاحظ وجود قيمة متطرفة كبيرة في عام 1958 من قبل اللاعب just fontaine

وبالمقارنة مع مخطط الهداف التاريخي لكأس العالم فإن هذا اللاعب لم يسجل سوى في هذه النسخة من البطولة

معظم قيم الأهداف في كل نسخة تتراوح بين 6 و 8

7. goal per tournament¶

In [84]:
goals_per_tournament = goals[[
    'tournament_year', 'goal_id'
]].groupby('tournament_year', as_index=False).agg({
    'goal_id': 'count'
}).rename(columns={
    'goal_id': 'goals_counter'
})

goals_per_tournament.head()
Out[84]:
tournament_year goals_counter
0 1930 70
1 1934 70
2 1938 84
3 1950 88
4 1954 140
In [85]:
fig = px.bar(
    goals_per_tournament,
    x='tournament_year', y='goals_counter',
    width=600, height=400
)

fig.show()

نلاحظ ارتفاع عددالأهداف المسجلة في كل بطولة مع مرور الزمن السبب الرئيسي لهذه الظاهرة هو ازدياد عدد المباريات كما سنرى في الطلب الرابع

8. Brazil, Germany and Italy goals info¶

In [86]:
goals_stages = goals[[
    'player_team_name', 'minute_regulation_with_stoppage', 'match_id']
].merge(
    matches[['match_id', 'stage_name']], on='match_id'
)

BGI_info = goals_stages[
    goals_stages['player_team_name'].isin(['Brazil', 'Germany', 'Italy'])
]

BGI_info[28:33]
Out[86]:
player_team_name minute_regulation_with_stoppage match_id stage_name
139 Italy 95 M-1934-17 final
140 Germany 29 M-1938-01 round of 16
158 Italy 2 M-1938-05 round of 16
160 Italy 94 M-1938-05 round of 16
161 Brazil 18 M-1938-06 round of 16
In [87]:
fig = px.strip(
    BGI_info, y='minute_regulation_with_stoppage',
    x='stage_name', color='player_team_name'
)
fig.show()
  • الأهداف تقل كل ما تم التقدم بمراحل البطولة
  • البرازيل هي صاحبة عدد الأهداف الأكبر
  • فقط ألمانيا من بين الثلاث دول من استطاع التسجيل في الشوط الإضافي الثاني في النهائي

وهو هدف ماريو غوتزه الذي جلب اللقب عام 2014

  • يتركز عدد كبير من أهداف ألمانيا بين الدقيقة 60 و 90
  • في دور الـ 16 ونصف النهائي كانت معظم أهداف إيطاليا ليست ضمن منتصف المباراة، أي ليست ضمن الدقائق من 25 إلى 80

C¶

1. matches frequency¶

تم أخذ الترتيب الأبجدي للفريقين لتجنب التكرار ما بين ذهاب وإياب

In [88]:
matches['match_name'] = matches[['home_team', 'away_team']].apply(
    lambda x: ' v '.join(sorted(list(x))), axis=1
)

check_na(matches, 'match_name')

matches[['home_team', 'away_team', 'match_name']][
    (matches['home_team'] == 'Brazil') | (matches['away_team'] == 'Brazil')
].head()
Out[88]:
home_team away_team match_name
2 Serbia Brazil Brazil v Serbia
11 Brazil Bolivia Bolivia v Brazil
23 Spain Brazil Brazil v Spain
40 Brazil Poland Brazil v Poland
44 Brazil Czech Republic Brazil v Czech Republic
In [89]:
matches_frequency = matches[
    ['match_name', 'match_id']
].groupby('match_name', as_index=False).count(
).sort_values(
    by='match_id', ascending=False
).rename(columns={'match_id': 'matches_frequency'})

matches_frequency.head()
Out[89]:
match_name matches_frequency
389 Germany v Serbia 7
28 Argentina v Germany 7
155 Brazil v Sweden 7
39 Argentina v Netherlands 6
152 Brazil v Serbia 6

2. frequency visualization¶

In [90]:
fig = px.bar(
    matches_frequency[:10],
    x='match_name', y='matches_frequency',
    width=600, height=400
)

fig.show()

D¶

1.players played more than one time¶

In [91]:
players_for_more_than_2_team = players_teams.loc[players_teams['teams_num'] > 1]
players_for_more_than_2_team.shape
Out[91]:
(18, 10)
In [92]:
players_for_more_than_2_team.sort_values(
    by='player_id'
)[
    ['player_id', 'full_name', 'team_name', 'teams_num', 'list_tournaments']
]
Out[92]:
player_id full_name team_name teams_num list_tournaments
455 P-00537 Franz Wagner Austria, Germany 2 1934, 1938
1080 P-01259 Ferenc Puskás Hungary, Spain 2 1954, 1962
1298 P-01512 José Altafini Brazil, Italy 2 1958, 1962
1490 P-01739 Davor Šuker Serbia, Croatia, Croatia 2 1990, 1998, 2002
1502 P-01757 Rudolf Raftl Austria, Germany 2 1934, 1938
2016 P-02369 Dejan Stanković Serbia, Serbia and Montenegro, Serbia 2 1998, 2006, 2010
2129 P-02502 Nikola Žigić Serbia and Montenegro, Serbia 2 2006, 2010
2163 P-02543 Luis Monti Argentina, Italy 2 1930, 1934
2191 P-02576 Robert Prosinečki Serbia, Croatia, Croatia 2 1990, 1998, 2002
2416 P-02845 Attilio Demaría Argentina, Italy 2 1930, 1934
2725 P-03207 Alen Bokšić Serbia, Croatia 2 1990, 2002
3900 P-04605 Robert Jarni Serbia, Croatia, Croatia 2 1990, 1998, 2002
4580 P-05394 José Santamaría Uruguay, Spain 2 1954, 1962
5355 P-06299 Nemanja Vidić Serbia and Montenegro, Serbia 2 2006, 2010
7446 P-08793 Vladimir Stojković Serbia and Montenegro, Serbia, Serbia 2 2006, 2010, 2018
7597 P-08975 Josef Stroh Austria, Germany 2 1934, 1938
7867 P-09293 Savo Milošević Serbia, Serbia and Montenegro 2 1998, 2006
8170 P-09633 Willibald Schmaus Austria, Germany 2 1934, 1938
بعد ضم النمسا لألمانيا عام 1938 شارك بعض لاعبو النمسا بفريق ألمانيا، أما بالنسبة للاعبين المشاركين بفريق صربيا وصربيا والجبل الأسود فهو ذات فريق بلدهم ولكن تم انفصال صربيا عن الجبل الأسود عام 2006

أيضاً هناك بعض اللاعبين الذين يملكون عدة جنسيات مثل Luis Monti وهو اللاعب الوحيد الذي فاز بكأس العالم مع بلدين مختلفين وبالتالي كان في الماضي يحق للاعب تمثيل أي منتخب يحمل جنسيته ولكن في 2004 قامت الفيفا بإصدار قوانين للحد من عمليات التجنيس لأجل تمثيل منتخب أخر

cramers_V¶

In [93]:
def cramers_v(df, col1, col2):
    
    cross_tabs = pd.crosstab(df[col1], df[col2])
    # getting the chi sq. stat
    chi2 = chi2_contingency(cross_tabs)[0]
    # calculating the total number of observations
    n = cross_tabs.sum().sum()
    # getting the degrees of freedom
    dof = min(cross_tabs.shape)-1
    # calculating cramer's v
    v = np.sqrt(chi2/(n*dof))
    # printing results
    print(f'V = {v}')
    print(f'Cramer\'s V Degrees of Freedom = {dof}')
In [94]:
def chi_sq_test(df, col1, col2):
    cross_tabs = pd.crosstab(df[col1], df[col2])
    chi2, p, dof, con_table = chi2_contingency(cross_tabs)
    print(f'chi-squared = {chi2}\np value= {p}\ndegrees of freedom = {dof}')

2.relation between country host and winner¶

In [95]:
chi_sq_test(tournaments, 'host_country', 'winner')
chi-squared = 120.63333333333335
p value= 0.2719475008128004
degrees of freedom = 112
In [96]:
print("the number of tournaments where the host won: ",
      tournaments.loc[tournaments['host_won'] == 1].shape[0])
print("the number of all tournamets: ", tournaments.shape[0])
the number of tournaments where the host won:  6
the number of all tournamets:  22
حسب قيمة p لا يوجد علاقة بينهما وفوز 6 منتخبات مضيفة من أصل 22 يؤكد ذلك

3.relation between host match and relative attendance category¶

In [97]:
chi_sq_test(matches, 'host_for_match', 'relative_attendance_category')
chi-squared = 6.499723567042024
p value= 0.038779567436588364
degrees of freedom = 2
In [98]:
cramers_v(matches, 'host_for_match', 'relative_attendance_category')
# cramers_v(matches,'host_for_match','used_capacity_ratio')
V = 0.08211243409515483
Cramer's V Degrees of Freedom = 1
حسب قيمة p نجد أنّ هناك ارتباط بين مباراة المضيف وفئة نسبة الحضور ولكن ضعيف وممكن تفسير ذلك بأنه ساكني البلد المضيف من أسهل لهم حضور مباراة منتخب بلدهم من الشعوب الأخرى

4.relation bettween host country and Attendance category¶

In [99]:
chi_sq_test(matches, 'host_country', 'attendance_category')
chi-squared = 366.43607721193973
p value= 1.9692874526706663e-58
degrees of freedom = 32
حسب قيمة p نجد أنّ هناك ارتباط بين الدولة المضيفة وفئة الحضور وممكن تفسير ذلك تبعا لحجم ملاعب الدولة المضيفة
In [100]:
cramers_v(matches, 'host_country', 'attendance_category')
V = 0.43595894992853435
Cramer's V Degrees of Freedom = 2

Secret Mission¶

عددالمباريات في كل بطولة

In [101]:
matches_per_tournament = matches[
    ['tournament_year', 'match_id']
].groupby(
    'tournament_year', as_index=False).count(
).rename(columns={'match_id': 'matches_counter'})

fig = px.bar(
    matches_per_tournament,
    x='tournament_year', y='matches_counter',
    width=600, height=400
)
fig.show()

النتيجة: ازدياد عدد المباريات عبر الزمن

أكثر 12 لاعب لعباً للمباريات عبر تاريخ كأس العالم

In [102]:
player_appearances = read_data('worldcup-1.1.0/data-csv/player_appearances.csv')
generate_full_name(player_appearances)
check_all_na(player_appearances)
player_appearances[['match_id', 'full_name']].tail()
Index(['key_id', 'tournament_id', 'tournament_name', 'match_id', 'match_name',
       'match_date', 'stage_name', 'group_name', 'team_id', 'team_name',
       'team_code', 'home_team', 'away_team', 'player_id', 'family_name',
       'given_name', 'shirt_number', 'position_name', 'position_code',
       'starter', 'substitute', 'captain'],
      dtype='object')

the shape is:  (20618, 22)
Out[102]:
match_id full_name
20613 M-2022-64 Youssouf Fofana
20614 M-2022-64 Kingsley Coman
20615 M-2022-64 Ibrahima Konaté
20616 M-2022-64 Eduardo Camavinga
20617 M-2022-64 Marcus Thuram
In [103]:
appearances = player_appearances[
    ['player_id', 'match_id', 'full_name']
].groupby('player_id', as_index=False).agg({
    'full_name': 'first',
    'match_id': 'count'
}).rename(columns={
    'match_id': 'appearances_counter'
}).sort_values(by='appearances_counter', ascending=False)

appearances.head()
Out[103]:
player_id full_name appearances_counter
1774 P-03429 Lionel Messi 26
4829 P-09502 Lothar Matthäus 25
2680 P-05224 Miroslav Klose 24
2644 P-05132 Paolo Maldini 23
1622 P-03142 Cristiano Ronaldo 22
In [104]:
fig = px.bar(
    appearances[:10], x='full_name', y='appearances_counter',
    width=600, height=400
)
fig.show()

إحصائية مهمة هي المعدل التهديفي للاعب، إذ لا يتم فقط الاهتمام بعدد الأهداف لكل لاعب بل يتم ربطه بعدد المباريات التي لعبها اللاعب

In [105]:
scorers_with_appearances = appearances.merge(
    goals_scorers[['player_id', 'goals_counter']], on='player_id'
)

scorers_with_appearances['goals_ratio'] = scorers_with_appearances['goals_counter'] / \
    scorers_with_appearances['appearances_counter']

scorers_with_appearances.sort_values('goals_ratio', ascending=False, inplace=True)
scorers_with_appearances.head()
Out[105]:
player_id full_name appearances_counter goals_counter goals_ratio
1089 P-08217 Helmut Haller 1 6 6.000000
997 P-04183 Oleg Salenko 3 6 2.000000
420 P-09516 Pelé 6 12 2.000000
969 P-04680 Geoff Hurst 3 5 1.666667
432 P-02871 Uwe Seeler 6 9 1.500000

سنقوم بعرض المعدلات التهديفية للاعبين الذي يملكون على الأقل 10 أهداف وذلك للمقارنة بين الهدافين التاريخيين

In [106]:
scorers12_with_appearances = scorers_with_appearances[
    scorers_with_appearances['goals_counter'] >= 10
]
In [107]:
fig = px.bar(
    scorers12_with_appearances[:12],
    x='full_name', y='goals_ratio', hover_data=['goals_counter', 'appearances_counter'],
    width=600, height=400
)
fig.show()

نلاحظ المعدل التهديفي الرهيب لبيليه والذي يشكل قيمة متطرفة، وعلى سبيل المثال رغم أن ميسي من الهدافين التاريخين ويتفوق على عدة لاعبين مثل مبابي من ناحية عدد الأهداف، إلا أن معدله التهديفي أقل

المنتخبات صاحبة أعلى أهداف

In [108]:
teams_goals = goals[[
    'team_id', 'team_name', 'goal_id'
]].groupby('team_id', as_index=False).agg({
    'team_name': 'first',
    'goal_id': 'count'
}).rename(columns={
    'goal_id': 'goals_counter'
}).sort_values('goals_counter', ascending=False)

teams_goals.head()
Out[108]:
team_id team_name goals_counter
8 T-09 Brazil 237
25 T-29 Germany 232
2 T-03 Argentina 152
24 T-28 France 136
35 T-39 Italy 128

المنتخبات صاحبة أعلى عدد من المباريات في كأس العالم

In [109]:
team_home_matches = matches[
    ['home_team_id', 'match_id']
].groupby(
    'home_team_id', as_index=False
).count().rename(columns={'home_team_id': 'team_id'})

team_away_matches = matches[
    ['away_team_id', 'match_id']
].groupby(
    'away_team_id', as_index=False
).count().rename(columns={'away_team_id': 'team_id'})

team_home_matches.head()
Out[109]:
team_id match_id
0 T-01 6
1 T-02 1
2 T-03 63
3 T-04 8
4 T-05 14
In [110]:
team_matches = team_home_matches.merge(team_away_matches, on='team_id')
team_matches['appearances_counter'] = team_matches['match_id_x'] + team_matches['match_id_y']
team_matches.drop(
    columns=['match_id_x', 'match_id_y'], inplace=True
)
team_matches.head()
Out[110]:
team_id appearances_counter
0 T-01 13
1 T-02 3
2 T-03 88
3 T-04 20
4 T-05 29
In [111]:
teams_goals_matches = team_matches.merge(
    teams_goals, on='team_id'
).sort_values('appearances_counter', ascending=False)
teams_goals_matches.head()
Out[111]:
team_id appearances_counter team_name goals_counter
8 T-09 114 Brazil 237
24 T-29 112 Germany 232
2 T-03 88 Argentina 152
33 T-39 83 Italy 128
22 T-27 74 England 104
In [112]:
fig = px.bar(
    teams_goals_matches[:12],
    x='team_name', y='appearances_counter', hover_data='goals_counter',
    width=600, height=400
)
fig.show()

المعدل التهديفي للفرق

In [113]:
teams_goals_matches['goals_ratio'] = teams_goals_matches['goals_counter'] / \
    teams_goals_matches['appearances_counter']

teams_goals_ratio = teams_goals_matches.sort_values('goals_ratio', ascending=False)
teams_goals_ratio.head()
Out[113]:
team_id appearances_counter team_name goals_counter goals_ratio
29 T-34 32 Hungary 87 2.718750
8 T-09 114 Brazil 237 2.078947
24 T-29 112 Germany 232 2.071429
68 T-77 10 Turkey 20 2.000000
23 T-28 73 France 136 1.863014
In [114]:
fig = px.bar(
    teams_goals_matches[:12],
    x='team_name', y='goals_ratio', hover_data=['goals_counter', 'appearances_counter'],
    width=600, height=400
)
fig.show()

المنتخبات التي حققت كأس العالم

In [115]:
tournaments_winners = tournaments[
    ['winner', 'tournament_id']
].groupby(
    'winner', as_index=False
).count().rename(columns={
    'tournament_id': 'winning_times'
}).sort_values('winning_times', ascending=False)

tournaments_winners
Out[115]:
winner winning_times
1 Brazil 5
4 Germany 4
5 Italy 4
0 Argentina 3
3 France 2
7 Uruguay 2
2 England 1
6 Spain 1

دراسة حالة الارتباط لعدد مرات بالبطولة مع كل من عدد المباريات والأهداف

In [116]:
teams_goals_with_wins = teams_goals_matches.merge(
    tournaments_winners.rename(columns={'winner': 'team_name'}),
    on='team_name', how='outer'
)

teams_goals_with_wins['winning_times'].fillna(0, inplace=True)
teams_goals_with_wins[:10]
Out[116]:
team_id appearances_counter team_name goals_counter goals_ratio winning_times
0 T-09 114 Brazil 237 2.078947 5.0
1 T-29 112 Germany 232 2.071429 4.0
2 T-03 88 Argentina 152 1.727273 3.0
3 T-39 83 Italy 128 1.542169 4.0
4 T-27 74 England 104 1.405405 1.0
5 T-28 73 France 136 1.863014 2.0
6 T-71 67 Spain 108 1.611940 1.0
7 T-44 60 Mexico 62 1.033333 0.0
8 T-81 59 Uruguay 89 1.508475 2.0
9 T-46 55 Netherlands 96 1.745455 0.0
In [117]:
teams_goals_with_wins.corr()
C:\Users\Sedra\AppData\Local\Temp\ipykernel_18428\1749428966.py:1: FutureWarning:

The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.

Out[117]:
appearances_counter goals_counter goals_ratio winning_times
appearances_counter 1.000000 0.969362 0.647729 0.797015
goals_counter 0.969362 1.000000 0.708467 0.849483
goals_ratio 0.647729 0.708467 1.000000 0.439483
winning_times 0.797015 0.849483 0.439483 1.000000

بالتأكيد، فإن هناك ارتباط قوي بين الثلاث سمات مع بعضهم البعض، عدد المباريات وعدد الأهداف وعدد مرات التتويج